library(seqinr)
library(ape)
library(pegas)
library(hierfstat)
library(mmod)
library(adegenet)
library(plyr)
library(strataG)
library(iNEXT)
library(gdm)
library(gdistance)
library(ecodist)
library(dplyr)
library(reshape2)
library(WriteXLS)
library(ggplot2)
library(knitr)
library(vegan)
source("config.R")
source("DIPnet_Stats_Functions.R")
#' Min-max rescale a numeric vector to the interval [0, 1].
#'
#' @param x Numeric vector.
#' @param na.rm Remove NAs when computing the range? Defaults to FALSE,
#'   matching the original behavior (any NA makes the whole result NA).
#' @return Numeric vector the same length as `x`. If all values of `x`
#'   are equal the result is NaN (0/0), as in the original implementation.
rescale <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)  # compute min and max in one pass
  (x - rng[1]) / (rng[2] - rng[1])
}
######################################################################
# 1. Import the IPDB and Fst tables
# NOTE: ipdb_path, spatial_path, spatial2_path, abgd_path, `drops`, and the
# amova_*_path conventions are presumably supplied by config.R (sourced above)
# -- confirm.
ipdb<-read.table(ipdb_path,sep="\t",header=T,stringsAsFactors = F,quote="", na.strings=c("NA"," ",""))
#read in geographical regionalizations from Treml
spatial<-read.table(spatial_path, header=T, sep="\t",stringsAsFactors = F, na.strings=c("NA"," ",""), quote="")
#read in geographical regionalizations from Beger
spatial2<-read.table(spatial2_path, header=T,sep="\t", stringsAsFactors = F, na.strings=c("NA"," ",""), quote="")
#read in ABGD groups
abgd<-read.table(abgd_path, header=T, sep="\t", stringsAsFactors = F)
#join spatial (left joins keyed on IPDB_ID; column positions 2,18:24 and 1,3
#select the region / ABGD-group columns -- positional, so fragile if the
#input files change shape)
ipdb<-join(ipdb,spatial, by = "IPDB_ID",type = "left")
ipdb<-join(ipdb,spatial2[,c(2,18:24)], by = "IPDB_ID", type = "left")
#join ABGD
ipdb<-join(ipdb,abgd[,c(1,3)], by = "IPDB_ID",type = "left")
# drop hybrids and divergent individuals (IDs listed in `drops`)
ipdb<-ipdb[ipdb$IPDB_ID %in% drops == FALSE, ]
## remove anything not included in the ecoregions scheme (some dolphins, some COTS from Kingman and Madagascar(?), some A. nigros from Kiribati, some C. auriga from Fakarava, hammerheads from Western Australia and West Africa, and some dolphins from the middle of the eastern tropical Pacific)
ipdb_ecoregions<-ipdb[-which(is.na(ipdb$ECOREGION)),]
## remove anything that doesn't occur in the 3 Indo-Pacific realms
ipdb_ip<-ipdb_ecoregions[which(ipdb_ecoregions$REALM %in% c("Central Indo-Pacific","Western Indo-Pacific","Eastern Indo-Pacific")),]
# output directories for the AMOVA results (user-specific absolute paths)
amova_ts_path<-"/Users/eric/google_drive/DIPnet_Gait_Lig_Bird/DIPnet_WG4_first_papers/statistics/By_Species/Hierarchical_structure"
amova_abgd_path<-"/Users/eric/google_drive/DIPnet_Gait_Lig_Bird/DIPnet_WG4_first_papers/statistics/By_ESU/Hierarchical_structure"
# Read in Veron barriers (pairs of neighboring Veron regions; no header row)
barriers<-read.csv("VeronBarriers.csv",header=F,stringsAsFactors = F)
# This is an R Markdown Notebook. I am trying it out as a way to reproducibly
# document my work on the DIPnet Population Structure Paper.
# The overall workflow for this analysis is as follows:
# Given the special considerations, we need to run 4 different flavors of AMOVA:
#   1. WCTheta and TradSpec  2. WCTheta and ABGD  3. PhiST and TradSpec  4. PhiST and ABGD
# Thus, we need to loop through all of the regionalizations above, running all
# 4 flavors of AMOVA.
## Loop through hypotheses, calculating AMOVA
# Each hypothesis is a column of ipdb_ip giving a regionalization scheme.
hypotheses<-c("ECOREGION", "PROVINCE","REALM","Bowen","Keith","Kulbicki_r","Kulbicki_b", "VeronDivis")
amova_list<-list()
# hierarchical.structure.mtDNA.db() comes from DIPnet_Stats_Functions.R (sourced above)
for(h in hypotheses){
hierstats<-hierarchical.structure.mtDNA.db(ipdb = ipdb_ip,level1 = "sample",level2=h,model="none",ABGD=F,nperm=1)
amova_list[[h]]<-hierstats
}
## Summarize AMOVA results
amovastats<-summarize_AMOVA(amova_list,hypotheses,specieslist=unique(ipdb$Genus_species_locus))
# NOTE(review): the Excel file name ends in ".Rdata.xlsx" -- the double
# extension looks unintentional; confirm before renaming (downstream loads
# may reference it as written).
WriteXLS(amovastats,ExcelFileName=file.path(amova_ts_path,"FST_TradSpec_raw_amova_output.Rdata.xlsx"))
save(amova_list,file=file.path(amova_ts_path,"FST_TradSpec_raw_amova_output.Rdata"))
save(amovastats,file=file.path(amova_ts_path,"FST_TradSpec_table_amova_output.Rdata"))
# This takes a long time, so we are not running this code in the document, but
# loading in the results as we go through each consideration.
load(file=file.path(amova_ts_path,"FST_TradSpec_table_amova_output.Rdata"))
# Measure support for each hypothesis
#Get the values for each hypothesis for a given criterion - here I use BIC - and rank them for #each species, then choose the "best" hypothesis for each species based on the criterion.
criterion<-"BIC"
# find the maximum number of species from any of the 8 hypotheses
maxlength<-max(sapply(amovastats,function(x) length(x[[1]])))
# create an empty data frame with row names from the hypothesis with the most values
# NOTE(review): assumes "PROVINCE" is the hypothesis with maxlength species -- confirm
crit_df<-data.frame(setNames(replicate(maxlength,numeric(0), simplify = F),nm=row.names(amovastats[["PROVINCE"]])))
#Loop through the hypotheses, pulling out the values for criterion,transpose it, and then merge these values into the dataframe from the previous hypothesis
for(h in names(amovastats)){
crit_df<-merge(crit_df,t(amovastats[[h]][criterion]),all=T,sort=F)
}
#get the hypothesis names in there (one row per hypothesis, one column per species)
row.names(crit_df)<-names(amovastats)
#rank the hypotheses for each species (lower criterion value = better rank;
#ties averaged, NAs kept so missing models stay NA)
crit_rank<-as.data.frame(sapply(crit_df,rank,na.last="keep",ties.method="average"))
row.names(crit_rank)<-names(amovastats)
## remove gsls with 3 or more missing models: keeps columns with fewer than
## nrow-5 NAs, which with 8 hypotheses means at most 2 missing
crit_rank<-crit_rank[, colSums(is.na(crit_rank)) < nrow(crit_rank)-5]
## choose the best hypothesis or set of hypotheses for each species
## (which(x==min) can return several tied hypotheses, so the result is a list)
best_hypothesis<-sapply(crit_rank,function(x){row.names(crit_rank)[which(x==min(x,na.rm=T)) ]})
## Make a bar graph of best support for each hypothesis among species
barplot<-ggplot(data=as.data.frame(unlist(best_hypothesis)), aes(x=unlist(best_hypothesis) )) + geom_bar(aes(y = (..count..)/sum(..count..))) + labs(x="Hypothesis",y="Proportion of Species", title="Proportional Support for Biogeographic Hypotheses based on BIC")
barplot
# lookup the minimum BIC value for each species (which.min works better here, because it returns only the first instance of the minimum value)
minBIC<-sapply(crit_df,function(x){x[which.min(x)]})
#J&O box 4 eqn 1 (J&O = presumably Johnson & Omland 2004 model-selection
#paper -- confirm). scale() seems to be the way to go here, using the minBIC
#as the centering vector, giving delta-BIC per species.
crit_df_deltaI<-scale(crit_df, center=minBIC, scale=F)
#J&O box 4 eqn 4. numerator, plus make it a data frame
crit_df_deltaI_b<-as.data.frame(exp(-0.5*crit_df_deltaI))
#J&O box 4 eqn 4. denominator
crit_df_deltaI_sums<-sapply(crit_df_deltaI_b,sum,na.rm=T)
# this time use the scale argument of scale to divide each column by the corresponding sum
# (yielding model weights / relative probabilities that sum to 1 per species)
crit_df_relative_prob<-scale(crit_df_deltaI_b,center=F,scale=crit_df_deltaI_sums)
## Make a heatmap of relative probability of each hypothesis
#melt for ggplot2 (wide matrix -> long hypothesis/species/value triples)
relprob<-melt(crit_df_relative_prob)
colnames(relprob)<-c("Hypothesis","Species","Relative_Probability")
#baseplot
rp<-ggplot(relprob,aes(y=Species,x=Hypothesis,fill=Relative_Probability))
#add geom_tile, turn the x-axis elements by 90 degrees, reverse the names on the y-axis, and use a diverging color scheme to highlight hypotheses with >50% rel prob.
rp<-rp+geom_tile()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
ylim(rev(levels(relprob$Species)))+
scale_fill_gradient2(low = "blue", mid = "white",
high = "red", midpoint = 0.5, space = "rgb",
na.value = "grey50", guide = "colourbar")
# (console output from the chunk above) Non Lab interpolation is deprecated
rp
# Code is suppressed for the other 3 examples.
# (console output) Non Lab interpolation is deprecated
# (console output) Non Lab interpolation is deprecated
# (console output) Non Lab interpolation is deprecated
# By sample:
# pairwise.structure.mtDNA.db() comes from DIPnet_Stats_Functions.R.
# NOTE(review): the load() below OVERWRITES the diffstats computed here --
# presumably these were alternative notebook chunks and only one is meant to
# run; confirm which result set is intended.
diffstats<-pairwise.structure.mtDNA.db(ipdb=ipdb, gdist = "WC Theta", minseqs = 5, minsamps = 3, mintotalseqs = 0, nrep = 0, num.cores = 2, ABGD = T, regionalization = "sample")
save(diffstats, file="/Users/eric/google_drive/DIPnet_Gait_Lig_Bird/DIPnet_WG4_first_papers/statistics/By_ESU/Pairwise_statistics/DIPnet_structure_sample_WCTheta_ABGD.Rdata")
diffstats<-pairwise.structure.mtDNA.db(ipdb=ipdb, gdist = "Jost D", minseqs = 5, minsamps = 3, mintotalseqs = 0, nrep = 0, num.cores = 2, ABGD = T, regionalization = "sample")
save(diffstats, file="/Users/eric/google_drive/DIPnet_Gait_Lig_Bird/DIPnet_WG4_first_papers/statistics/By_ESU/Pairwise_statistics/DIPnet_structure_sample_JostD_ABGD.Rdata")
load("/Users/eric/google_drive/DIPnet_Gait_Lig_Bird/DIPnet_WG4_first_papers/statistics/By_Species/Pairwise_statistics/sample/DIPnet_structure_sample_WCTHETA.Rdata")
# one entry per genus-species-locus (gsl)
esu_loci <- unique(ipdb$Genus_species_locus)
# accumulators for the GDM loop below
gdm.full<-list()
solution<-list()
nosolution<-list()
nosolution.full<-list()
# per-gsl full-model statistics (regions + distance vs distance-only GDM)
stats.full<-data.frame(Species_Locus=character(0),WithRegionsDeviance=numeric(0),WithRegionsExplainedDeviance=numeric(0),NoRegionsDeviance=numeric(0),NoRegionsExplainedDeviance=numeric(0),DeltaDeviance=numeric(0),Pvalue_regions=numeric(0),Pvalue_dist=numeric(0),stringsAsFactors = F)
# per-barrier statistics (GDM with/without barrier, plus MRM results)
stats<-data.frame(Species_Locus=character(0),Barrier=character(0),WithBarrierDeviance=numeric(0),WithBarrierExplainedDeviance=numeric(0),ImportanceDistanceWithBarrier=numeric(0),ImportanceBarrierWithBarrier=numeric(0),NoBarrierDeviance=numeric(0),NoBarrierExplainedDeviance=numeric(0),ImportanceDistanceWithoutBarrier=numeric(0),DeltaDeviance=numeric(0),Pvalue_barrier=numeric(0),Pvalue_dist=numeric(0),MRDM.rsquared=numeric(0),MRDM.rsquared.pvalue=numeric(0),MRDM.dist.pvalue=numeric(0), MRDM.barrier.pvalue=numeric(0),stringsAsFactors = F)
# gsls skipped for too few populations / NA Fst tables
nostats<-NULL
# bookkeeping: which barrier tests were attempted and which obtained a solution
barriertests<-data.frame(Species=character(0),Region1=character(0),NumPops1=numeric(0),Region2=character(0),Numpops2=numeric(0), Test=logical(0), Solution=logical(0),stringsAsFactors = F)
# running row index into barriertests
k<-0
# For each genus-species-locus: filter the Fst matrix and sample metadata to a
# common, Veron-regionalized set of populations, compute great-circle
# distances, fit a full GDM (distance + Veron regions) and a distance-only GDM,
# and estimate p-values by permuting the response 1000 times.
for(gsl in esu_loci){ #gsl<-"Linckia_laevigata_CO1" "Tridacna_crocea_CO1" "Lutjanus_kasmira_CYB" "Acanthaster_planci_CO1"
cat("\n","\n","\n","Now starting", gsl, "\n")
if(any(is.na(diffstats[[gsl]]))){cat("NAs in FST table, No gdm calculated"); next}
# NOTE(review): if diffstats[[gsl]] is a matrix this == comparison is
# elementwise (length > 1) and if() will warn/error -- presumably the entry is
# a bare message string in that failure case; confirm.
if(diffstats[[gsl]]=="Fewer than 3 sampled populations after filtering. No stats calculated"){nostats<-c(nostats,gsl);next}
#pull out the data for this genus-species-locus (gsl)
sp<-ipdb[which(ipdb$Genus_species_locus==gsl),]
#clean weird backslashes from names
sp$locality<-gsub("\"","",sp$locality)
sp$sample<-paste(sp$locality,round(sp$decimalLatitude, digits=0),round(sp$decimalLongitude, digits=0),sep="_") #sets up a variable that matches the name in Fst table
sp<-sp[order(sp$sample),]
# Not all localities are included in Veron's regionalization (e.g. Guam), so get their names and then zap NAs
nonVeronpops<-unique(sp$sample[is.na(sp$VeronDivis)])
sp<-sp[!is.na(sp$VeronDivis),]
#subsample Fst
gslFST<-diffstats[[gsl]]
#make a matrix out of gslFST
gslFSTm<-as.matrix(gslFST)
gslFSTm[which(gslFSTm > 1)] <- 1 #some values that look like 1.0000 are registering as greater than 1
gslFSTm[which(gslFSTm < 0)] <- 0.0001 #get rid of artifactual negative values
#gslFSTm<-rescale(gslFSTm)
#gslFSTm<-gslFSTm/(1-gslFSTm)
#zap weird slashes in the names
rownames(gslFSTm)<-gsub("\"","",rownames(gslFSTm))
colnames(gslFSTm)<-rownames(gslFSTm)
#zap the same na populations from the list of non existent pops from VeronDivis
if(any(rownames(gslFSTm) %in% nonVeronpops)){
gslFSTm<-gslFSTm[-(which(rownames(gslFSTm) %in% nonVeronpops)),-(which(colnames(gslFSTm) %in% nonVeronpops))]
}
if(length(rownames(gslFSTm))<5){nostats<-c(nostats,gsl);cat("Fewer than 5 sampled populations");next}
#and filter sp based on the localities that have Fst values
sp<-sp[sp$sample %in% rownames(gslFSTm),]
#and vice versa
gslFSTm<- gslFSTm[which(rownames(gslFSTm) %in% unique(sp$sample)),which(rownames(gslFSTm) %in% unique(sp$sample))]
#create a locations data frame that has all the localities plus lats and longs and their Veron region.
locs<-as.data.frame(unique(sp$sample))
names(locs)<-"sample"
#locs$Long<-sp$decimalLongitude[which(locs %in% sp$sample)]
#can't do a unique on sample, lats and longs because some samples have non-unique lats and longs! So I do a join and take the first match.
locs<-join(locs,sp[c("sample","decimalLongitude","decimalLatitude"
,"VeronDivis")], by="sample", match="first")
#sort gslFSTm
gslFSTm<-gslFSTm[order(rownames(gslFSTm)),order(colnames(gslFSTm))]
# convert to data frame with popsample names as first column
# (format required by gdm::formatsitepair bioFormat=3)
gslFSTm<-cbind(sample=locs$sample,as.data.frame(gslFSTm))
######################################################################
# 3. Calculate Great Circle Distance
gcdist_km <- pointDistance(locs[,2:3],lonlat=T)/1000
#symmetricize the matrix (pointDistance fills only the lower triangle)
gcdist_km[upper.tri(gcdist_km)]<-t(gcdist_km)[upper.tri(gcdist_km)]
#cbind on the sample names
gcdist_km <- cbind(sample=locs$sample,as.data.frame(gcdist_km))
####################################################################
#4. Full Model
cat("Running Full Model")
# default: single dummy region column when only one Veron region is present;
# otherwise one 0/1 indicator column per region (model.matrix with no intercept)
regions<-cbind(locs[,c(1,2,3)],VeronDivis=rep_len(1,length(locs[,1])))
if(length(unique(locs$VeronDivis)) > 1){
regions<-cbind(locs[,c(1,2,3)],with(locs, data.frame(model.matrix(~VeronDivis+0))))
}
gslFSTmll<-cbind(locs[,c(2,3)],gslFSTm)
gdm.format.full<-formatsitepair(bioData=gslFSTm, bioFormat=3, predData=regions,XColumn = "decimalLongitude", YColumn = "decimalLatitude", siteColumn="sample", distPreds=list(gcdist_km))
#zap population pairs with 0 geographical distance between them. Troubling.
zaps<-which(gdm.format.full$s2.matrix_1==0)
if(length(zaps)>0) {
gdm.format.full<-gdm.format.full[-zaps,]
}
# run the full model, and the model with only distance
# (columns 1:6 are the fixed site-pair columns; matrix_1 is great-circle distance)
fullgdm<-gdm(gdm.format.full)
distgdm<-gdm(gdm.format.full[,c(1:6,grep("matrix_1",names(gdm.format.full)))])
if(is.null(fullgdm) | is.null(distgdm)){
cat("No solution obtained on Full Model")
nosolution.full[[gsl]]<-gdm.format.full
next}
# pull out stats
#difference in deviance
deltadev.regions<-distgdm$gdmdeviance-fullgdm$gdmdeviance
#percent of null deviance explained by the model with just distance
explaineddev.dist<-distgdm$explained
gdm.regions.deviance<-fullgdm$gdmdeviance
gdm.regions.explained<-fullgdm$explained
gdm.no.regions.deviance<-distgdm$gdmdeviance
gdm.no.regions.explained<-distgdm$explained
##############################################################################
# 4A. Perform Monte-Carlo permutations to develop a null distribution
# of deviance values and determine significance (pvalue)
gdm.format.rand.full<-gdm.format.full
rand.deltas.full<-vector()
rand.explained.full<-vector()
# keep permuting until 1000 permutations with a GDM solution are collected
while(length(rand.deltas.full) < 1000) {
gdm.format.rand.full$distance<-sample(gdm.format.rand.full$distance,size=length(gdm.format.rand.full$distance))
gdm.barrier.rand.full<-gdm(gdm.format.rand.full)
gdm.no.barrier.rand.full<-gdm(gdm.format.rand.full[,c(1:6,grep("matrix_1",
names(gdm.format.rand.full)))])
# if no solution obtained, go to next permutation
if(is.null(gdm.barrier.rand.full) | is.null(gdm.no.barrier.rand.full)){next}
deltadev.rand.full<-gdm.no.barrier.rand.full$gdmdeviance-gdm.barrier.rand.full$gdmdeviance
explained.rand.full<-gdm.no.barrier.rand.full$explained
rand.deltas.full<-c(rand.deltas.full,deltadev.rand.full)
rand.explained.full<-c(rand.explained.full,explained.rand.full)
}
# p-value = fraction of permuted statistics exceeding the observed one
pvalue_regions.full<-length(which(abs(deltadev.regions) < abs(rand.deltas.full)))/length(rand.deltas.full)
pvalue_dist.full<-length(which(abs(explaineddev.dist) < abs(rand.explained.full)))/length(rand.explained.full)
stats.1<-c(gsl,gdm.regions.deviance,gdm.regions.explained,gdm.no.regions.deviance,gdm.no.regions.explained,deltadev.regions,pvalue_regions.full,pvalue_dist.full)
stats.full[nrow(stats.full)+1,]<-stats.1
gdm.full[[gsl]]<-fullgdm
}
####################################################################### Calculate Overwater Distances#
#Save for later##
#######################################################################
# 5. Create a subset of the distance matrices including only the localities from
# two neighboring Veron regions.
#make a table to keep track of all these tests for each species
# NOTE(review): this loop sits AFTER the for(gsl ...) loop above, so `gsl`,
# `locs`, `gslFSTm` and `gcdist_km` hold values from the LAST species only.
# In the original Rmd it was presumably nested inside the species loop --
# confirm before relying on these results.
# NOTE(review): 1:16 hard-codes the number of barrier rows; seq_len(nrow(barriers))
# would track the VeronBarriers.csv file.
for(j in 1:16){
k<-k+1
barrier<-c(barriers[j,1],barriers[j,2])
subset_locs<-which(locs$VeronDivis==barrier[1] | locs$VeronDivis==barrier[2])
locs2<-locs[subset_locs,]
# NOTE(review): these index into locs2 with a mask built from locs -- works
# only because which() on the full table returns positions also valid there;
# verify the intended counts.
Numpops1<-length(locs2$sample[which(locs$VeronDivis==barrier[1])])
Numpops2<-length(locs2$sample[which(locs$VeronDivis==barrier[2])])
cat("Now Starting",barrier,"\n")
# +1 offsets skip the leading "sample" name column
gcdist_km2<-gcdist_km[subset_locs,c(1,subset_locs+1)]
gslFSTm2<-gslFSTm[subset_locs,c(1,subset_locs+1)]
#######################################################################
# 6. Create a dummy distance matrix for each putative "barrier"
# between the two regions (1s and 0s)
barrier_m2<-as.matrix(dist(as.numeric(locs2$VeronDivis %in% barrier[1])))
barrier_m2 <- cbind(sample=locs2$sample,as.data.frame(barrier_m2))
#if there aren't enough samples on either side of this barrier, then record this as non testable and go to next barrier
if(Numpops1+Numpops2 < 3){cat("Less than three sampled populations; not testable\n"); barriertests[k,]<-c(gsl,barrier[1],Numpops1,barrier[2],Numpops2,F,F);next}
if( Numpops1 < 1) {cat("not enough populations within",barrier[1],"\n") ; barriertests[k,]<-c(gsl,barrier[1],Numpops1,barrier[2],Numpops2,F,F);next}
if( Numpops2 < 1) {cat("not enough populations within",barrier[2],"\n") ; barriertests[k,]<-c(gsl,barrier[1],Numpops1,barrier[2],Numpops2,F,F);next}
############################################################################
# 7. Run through gdm with the barrier and without. Save the deviance values.
locs2$sample<-as.character(locs2$sample)
gslFSTm2$sample<-as.character(gslFSTm2$sample)
gcdist_km2$sample<-as.character(gcdist_km2$sample)
gdm.format<-formatsitepair(bioData=gslFSTm2, bioFormat=3, predData=locs2[,1:3],XColumn = "decimalLongitude", YColumn = "decimalLatitude", siteColumn="sample", distPreds=list(gcdist_km2,barrier_m2))
#zap population pairs with 0 geographical distance between them. Troubling.
zaps<-which(gdm.format$s2.matrix_1==0)
if(length(zaps)>0) {
gdm.format<-gdm.format[-zaps,]
}
#run gdm with and without the barrier (matrix_2 is the barrier dummy matrix)
gdm.barrier<-gdm(gdm.format)
gdm.no.barrier<-gdm(gdm.format[,-grep("matrix_2",names(gdm.format))])
# run mrdm (multiple regression on distance matrices; 10000 permutations)
mrdm<-MRM(formula = distance ~ s2.matrix_1 + s2.matrix_2, data=gdm.format,nperm = 10000)
#TROUBLESHOOTING: save fst matrices from gdm models that obtain no solution
if(is.null(gdm.barrier) | is.null(gdm.no.barrier))
{cat("No Solution Obtained \n");
nosolution[[paste(gsl,barrier[1],Numpops1,barrier[2],Numpops2,sep=",")]]<-list(locs2,gcdist_km2,gslFSTm2,gdm.format);
barriertests[k,]<-c(gsl,barrier[1],Numpops1,barrier[2],Numpops2,T,F);
next}
#difference in deviance
deltadev<-gdm.no.barrier$gdmdeviance-gdm.barrier$gdmdeviance
#percent of null deviance explained by the model with just distance
explaineddev<-gdm.no.barrier$explained
##############################################################################
# 8. Perform Monte-Carlo permutations to develop a null distribution
# of deviance values and determine significance (pvalue)
gdm.format.rand<-gdm.format
rand.deltas<-vector()
rand.explained<-vector()
# collect 1000 permutations that obtain a GDM solution
while(length(rand.deltas) < 1000) {
gdm.format.rand$distance<-sample(gdm.format.rand$distance,size=length(gdm.format.rand$distance))
gdm.barrier.rand<-gdm(gdm.format.rand)
gdm.no.barrier.rand<-gdm(gdm.format.rand[,-grep("matrix_2",
names(gdm.format))])
if(is.null(gdm.barrier.rand) | is.null(gdm.no.barrier.rand)){next}
deltadev.rand<-gdm.no.barrier.rand$gdmdeviance-gdm.barrier.rand$gdmdeviance
explained.rand<-gdm.no.barrier.rand$explained
rand.deltas<-c(rand.deltas,deltadev.rand)
rand.explained<-c(rand.explained,explained.rand)
}
pvalue_barrier<-length(which(abs(deltadev) < abs(rand.deltas)))/length(rand.deltas)
pvalue_dist<-length(which(abs(explaineddev) < abs(rand.explained)))/length(rand.explained)
cat("Good Solution \n")
#save stats
gdm.barrier.deviance<-gdm.barrier$gdmdeviance
gdm.barrier.explained<-gdm.barrier$explained
gdm.no.barrier.deviance<-gdm.no.barrier$gdmdeviance
gdm.no.barrier.explained<-gdm.no.barrier$explained
# "importance" = sum of fitted I-spline coefficients for each predictor
impt.dist.gdm.barrier<-sum(gdm.barrier$coefficients[1:gdm.barrier$splines[1]])
impt.barrier.gdm.barrier<-sum(gdm.barrier$coefficients[gdm.barrier$splines[1]+1:gdm.barrier$splines[1]])
impt.dist.gdm.no.barrier<-sum(gdm.no.barrier$coefficients[1:gdm.no.barrier$splines[1]])
mrdm.rsquared<-mrdm$r.squared[1]
mrdm.rsquared.pvalue<-mrdm$r.squared[2]
mrdm.dist.pvalue<-mrdm$coef[2,2]
mrdm.barrier.pvalue<-mrdm$coef[3,2]
stats_model<-c(gsl,paste(barrier[1],barrier[2],sep="-"),gdm.barrier.deviance,gdm.barrier.explained, impt.dist.gdm.barrier, impt.barrier.gdm.barrier, gdm.no.barrier.deviance, gdm.no.barrier.explained,impt.dist.gdm.no.barrier, deltadev,pvalue_barrier,pvalue_dist,mrdm.rsquared,mrdm.rsquared.pvalue,mrdm.dist.pvalue,mrdm.barrier.pvalue)
stats[nrow(stats)+1,]<-stats_model
#record this in the table
barriertests[k,]<-c(gsl,barrier[1],Numpops1,barrier[2],Numpops2,T,T)
}
# FDR-correct the permutation p-values (as.numeric because the data frames were
# filled row-wise with character vectors)
stats.full$GDM.regions.qvalue<-p.adjust(as.numeric(stats.full$Pvalue_regions),method="fdr")
stats.full$GDM.distance.qvalue<-p.adjust(as.numeric(stats.full$Pvalue_dist),method="fdr")
stats$GDM.qvalue<-p.adjust(as.numeric(stats$Pvalue_barrier),method="fdr")
stats$MRDM.r2.qvalue<-p.adjust(as.numeric(stats$MRDM.rsquared.pvalue),method="fdr")
stats$MRDM.dist.qvalue<-p.adjust(as.numeric(stats$MRDM.dist.pvalue),method="fdr")
stats$MRDM.barrier.qvalue<-p.adjust(as.numeric(stats$MRDM.barrier.pvalue),method="fdr")
# do some figuring with the results
# number of attempted barrier tests
length(barriertests[which(barriertests$Test==T),1])
#working GDM tests
length(barriertests[which(barriertests$Solution==T & barriertests$Test==T),1])
#failed GDM tests
length(barriertests[which(barriertests$Solution==F & barriertests$Test==T),1])
length(stats[which(stats$GDM.qvalue <= 0.02),1])
length(stats[which(stats$MRDM.barrier.qvalue <= 0.02),1])
#overlap between MRDM and GDM
length(stats[which(stats$GDM.qvalue <= 0.02 & stats$MRDM.barrier.qvalue <= 0.02),1])
# NOTE(review): strict < 0.02 here vs <= 0.02 in the counts above -- a row with
# q-value exactly 0.02 is counted above but excluded here; confirm intent.
goodbarriers<-stats %>% filter(GDM.qvalue < 0.02) %>% group_by(Barrier) %>% summarize(goodbarriers = n())
allbarriers<-stats %>% group_by(Barrier) %>% summarize(barrier_tests = n())
barrier_ratios<-left_join(allbarriers,goodbarriers,by="Barrier")
# barriers with no significant tests get 0 rather than NA
barrier_ratios$goodbarriers[which(is.na(barrier_ratios$goodbarriers))]<-0
barrier_ratios<-mutate(barrier_ratios, goodbarriers/barrier_tests)
kable(barrier_ratios)
write.csv(stats,"./output/GDM_output_WCTheta_1000reps.csv")
# NOTE(review): file name mentions both JostD and WCTheta -- presumably a
# copy-paste leftover; confirm which statistic these results are.
write.csv(barrier_ratios, "./output/GDM_JostD_WCTheta_barriers.csv")
# read in the Fst/PhiSt table
load("~/google_drive/DIPnet_Gait_Lig_Bird/DIPnet_WG4_first_papers/statistics/By_Species/Pairwise_statistics/sample/DIPnet_structure_sample_WCTheta.Rdata")
#load("~/Desktop/DIPnet_structure_sample_PhiST_042817.Rdata")
##1. Dataframe for results of the dbRDA loop below (one row per gsl)
stats<-data.frame(Species_Locus=character(0),constrained.inertia=numeric(0),totalInertia=numeric(0),ProportionConstrained=numeric(0),adj.R2.total=numeric(0),modelF=numeric(0),modelPvalue=numeric(0),pcx_Var=numeric(0),pcx_p=numeric(0),pcy_Var=numeric(0),pcy_p=numeric(0),bestmodel=character(0), constrained.inertia.best=numeric(0), total.inertia.best=numeric(0), proportion.constrained.inertia.best=numeric(0), adj.R2.best.model=numeric(0), stringsAsFactors = F)
stats$Species_Locus<-as.character(stats$Species_Locus)
#stats$Barrier<-as.character(stats$Barrier)
stats$bestmodel<-as.character(stats$bestmodel)
# Make an empty list to save rda output for each species (NULL-initialized,
# named by gsl)
esu_loci <- unique(ipdb$Genus_species_locus)
all.gsl.rda<-sapply(esu_loci, function(x) NULL)
#make empty data frames to save variance and pvalues (columns accrue via merge
#as terms appear across species)
term.vars<-data.frame(gsl=character(0))
term.pvals<-data.frame(gsl=character(0))
###############################################################################
# 2. Subsample for each species of interest, and filter based on Phi_ST table.
# For each gsl: filter the Fst matrix and metadata to a common Veron-
# regionalized population set, compute principal coordinates of the Fst and
# great-circle distance matrices, run a dbRDA (rda of Fst PCoA scores on
# region dummies + spatial PCs), and do forward model selection.
for(gsl in esu_loci){ #gsl<-"Linckia_laevigata_CO1" "Tridacna_crocea_CO1" "Lutjanus_kasmira_CYB"
cat("\n","\n","\n","Now starting", gsl, "\n")
if(any(is.na(diffstats[[gsl]]))){cat("NAs in FST table, No dbRDA calculated"); next}
# NOTE(review): if diffstats[[gsl]] is a matrix this == is elementwise and
# if() will warn/error -- presumably the entry is a bare message string in the
# failure case; also the stored message says "Fewer than 5" while the test
# string says "Fewer than 3" -- confirm.
if(diffstats[[gsl]]=="Fewer than 3 sampled populations after filtering. No stats calculated"){all.gsl.rda[[gsl]]<-"Fewer than 5 sampled populations after filtering."; cat("Fewer than 5 sampled populations after filtering.");next}
sp<-ipdb[which(ipdb$Genus_species_locus==gsl),]
#clean weird backslashes from names
sp$locality<-gsub("\"","",sp$locality)
sp$sample<-paste(sp$locality,round(sp$decimalLatitude, digits=0),round(sp$decimalLongitude, digits=0),sep="_") #sets up a variable that matches the name in Fst table
sp<-sp[order(sp$sample),]
# Not all localities are included in Veron's regionalization (e.g. Guam), so get their names and then zap NAs
nonVeronpops<-unique(sp$sample[is.na(sp$VeronDivis)])
sp<-sp[!is.na(sp$VeronDivis),]
#subsample Fst
gslFST<-diffstats[[gsl]]
#make a matrix out of gslFST, convert negative values to zero
gslFSTm<-as.matrix(gslFST)
gslFSTm[which(gslFSTm<0)]<-0.0
#zap weird slashes in the names
rownames(gslFSTm)<-gsub("\"","",rownames(gslFSTm))
colnames(gslFSTm)<-rownames(gslFSTm)
#zap the same na populations from the list of non existent pops from VeronDivis
if(any(rownames(gslFSTm) %in% nonVeronpops)){
gslFSTm<-gslFSTm[-(which(rownames(gslFSTm) %in% nonVeronpops)),-(which(colnames(gslFSTm) %in% nonVeronpops))]
}
if(length(rownames(gslFSTm))<5){all.gsl.rda[[gsl]]<-"Fewer than 5 sampled populations";cat("Fewer than 5 sampled populations");next}
#and filter sp based on the localities that have Fst values
sp<-sp[sp$sample %in% rownames(gslFSTm),]
#and vice versa
gslFSTm<- gslFSTm[which(rownames(gslFSTm) %in% unique(sp$sample)),which(rownames(gslFSTm) %in% unique(sp$sample))]
if(length(rownames(gslFSTm))<5){all.gsl.rda[[gsl]]<-"Fewer than 5 sampled populations";cat("Fewer than 5 sampled populations");next}
#create a locations data frame that has all the localities plus lats and longs and their Veron region.
locs<-as.data.frame(unique(sp$sample))
names(locs)<-"sample"
#locs$Long<-sp$decimalLongitude[which(locs %in% sp$sample)]
#can't do a unique on sample, lats and longs because some samples have non-unique lats and longs! So I do a join and take the first match.
locs<-join(locs,sp[c("sample","decimalLongitude","decimalLatitude"
,"VeronDivis")], by="sample", match="first")
# FIX(review): was a bare string expression `"Only one region!"` with no
# effect; record the skip reason in all.gsl.rda like every other skip branch.
if (length(unique(locs$VeronDivis)) < 2) {all.gsl.rda[[gsl]]<-"Only one region!"; cat("Only one region!"); next}
#sort gslFSTm
gslFSTm<-gslFSTm[order(rownames(gslFSTm)),order(colnames(gslFSTm))]
######################################################################
# 4. Calculate Great Circle Distance
gcdist_km <- pointDistance(locs[,2:3],lonlat=T)/1000
# and symmetricise it
gcdist_km[upper.tri(gcdist_km)]<-0
gcdist_km<-gcdist_km + t(gcdist_km)
####################################################################### Calculate Overwater Distances#
#Save for later##
##############################################################################
#5. Create a matrix of regional identities
regions<-with(locs, data.frame(model.matrix(~VeronDivis+0))) #one of the predictors is superfluous but will get knocked out during RDA
row.names(regions)<-row.names(locs)
############################################################################
# 7. Calculate the principal coordinates
FST.pcoa<-cmdscale(gslFSTm, k=dim(as.matrix(gslFSTm))[1] - 1, eig=TRUE, add=FALSE) #ignore warnings - OK to have negatives according to Anderson
FST.scores<-FST.pcoa$points
# only the first two spatial PCs are used as distance predictors
gcdist.pcoa<-cmdscale(gcdist_km, k=2, eig=TRUE, add=FALSE)
gcdist.scores<-gcdist.pcoa$points
gcdist.scores<-data.frame("pcx"=gcdist.pcoa$points[,1],"pcy"=gcdist.pcoa$points[,2])
locs2<-cbind(regions,gcdist.scores)
###########################################################################
# 8. Calculate the RDA and extract statistics
RDA.res<-rda(FST.scores~., data=locs2, scale=TRUE )
#Extract Statistics
constrained.inertia<-summary(RDA.res)$constr.chi
total.inertia<-summary(RDA.res)$tot.chi
proportion.constrained.inertia<-constrained.inertia/total.inertia
adj.R2.total.model<-RsquareAdj(RDA.res)$adj.r.squared
if(is.na(adj.R2.total.model)){cat("Predictors >= Observations! No solution");all.gsl.rda[[gsl]]<-"Predictors >= Observations! No solution";next}
# overall model significance, then per-term significance
model.sig<-anova.cca(RDA.res, step=1000)
modelF<-model.sig$F[1]
modelPvalue<-model.sig$`Pr(>F)`[1]
terms.sig<-anova(RDA.res, by="term", step=1000)
pcx_Var<-terms.sig$Variance[1]
pcx_p<-terms.sig$`Pr(>F)`[1]
pcy_Var<-terms.sig$Variance[2]
pcy_p<-terms.sig$`Pr(>F)`[2]
#extract all variance values into a dataframe
term.var<-as.data.frame(t(terms.sig[,2]))
names(term.var)<-rownames(terms.sig)
#extract all p-values into a dataframe
term.pval<-as.data.frame(t(terms.sig[,4]))
names(term.pval)<-rownames(terms.sig)
# scale the variance to proportions
term.var<-term.var/sum(term.var)
# set variance to zero if it is not significant
term.var[1,which(term.pval[1,] > 0.05 | is.na(term.pval[1,]))]<-0
#set all values to zero if the model itself is not significant
if(modelPvalue > 0.05){
term.var[1,which(names(term.var)!="gsl")]<-0
}
# add on the species name
term.pval$gsl<-gsl
term.var$gsl<-gsl
#merge them on to each dataframe
term.vars<-merge(term.vars,term.var,all=T)
term.pvals<-merge(term.pvals,term.pval,all=T)
#barrier_Var<-terms.sig$Variance[3] #omitting for now as the number of barriers will differ
#barrier_p<-terms.sig$`Pr(>F)`[3]
#marg.sig<-anova(RDA.res, by="margin", step=1000)
#barrier_margVar<-marg.sig$Variance[3]
#barrier_margp<-marg.sig$`Pr(>F)`[3]
#Model selection
nullmodel<-rda(FST.scores~1, data=locs2, scale=TRUE )
# FIX(review): argument was misspelled `directon=` and silently ignored via
# `...`; "forward" is also ordiR2step's default, so behavior is unchanged.
# NOTE(review): `psteps` is not an ordiR2step argument either -- permutations
# are controlled by `permutations=`; confirm intended permutation count.
forward.model<-ordiR2step(nullmodel, scope=formula(RDA.res), direction="forward", psteps=1000) #ordiR2step implements Blanchets stopping criterion
bestmodel<-as.character(forward.model$call[2])
# if forward selection retained no constraints there is no constrained inertia
if (is.null(summary(forward.model)$constr.chi)) {
constrained.inertia.best<-NA
total.inertia.best<-summary(forward.model)$tot.chi
proportion.constrained.inertia.best<-NA
adj.R2.best.model<-NA
} else {
constrained.inertia.best<-summary(forward.model)$constr.chi
total.inertia.best<-summary(forward.model)$tot.chi
proportion.constrained.inertia.best<-constrained.inertia.best/total.inertia.best
adj.R2.best.model<-RsquareAdj(forward.model)$adj.r.squared
}
#save stats
stats_model<-c(gsl,constrained.inertia,total.inertia,proportion.constrained.inertia,adj.R2.total.model,modelF,modelPvalue,pcx_Var,pcx_p,pcy_Var,pcy_p, bestmodel, constrained.inertia.best, total.inertia.best, proportion.constrained.inertia.best, adj.R2.best.model)
stats[nrow(stats)+1,]<-stats_model
all.gsl.rda[[gsl]]<-RDA.res
}
# write out the per-term variances / p-values, the RDA objects, and the
# summary stats table
write.csv(term.vars,file="./output/dbRDA_term_vars_WCTHETA.csv")
write.csv(term.pvals,file="./output/dbRDA_term_pvals_WCTHETA.csv")
save(all.gsl.rda,file="./output/multibarrier_dbRDA_WCTHETA.Rdata")
# NOTE(review): filename says JOSTD but the loaded data above is WCTheta --
# presumably a copy-paste leftover; confirm.
write.csv(stats, "./output/multibarrier_dbRDA_JOSTD.csv")